from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col
from pyspark.sql.types import DoubleType

# Build (or reuse, if one already exists) the Spark session for this script,
# using the 'local' master and the application name 'HelloSpark'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('HelloSpark')
    .getOrCreate()
)

# df = spark.read.text('test.txt')
# df.show(truncate=False)

# Load the CSV dataset: the first row is treated as the header and column
# types are inferred from the data.
# NOTE(review): the path uses a Windows-style backslash separator -- confirm
# it resolves on the platform this script is run on.
df = spark.read.csv(
    "dataset\\BeijingPM20100101_20151231.csv",
    header=True,
    inferSchema=True,
)

# Missing-value handling:
# map the 'NA' marker in PM_Dongsi to 0.0, otherwise cast the value to double.
# The alias wraps the whole when/otherwise expression; an alias placed on the
# column inside otherwise() does not name the resulting output column.
df.select(
    'No',
    'year',
    'month',
    when(col('PM_Dongsi') == 'NA', 0.0)
    .otherwise(col('PM_Dongsi').cast(DoubleType()))
    .alias('pm'),
).show(truncate=False)

# Replace the 'NA' marker with 'NaN' in the PM_Dongsi column only.
# `subset` is passed by keyword on purpose: the second positional parameter
# of na.replace() is `value`, which is ignored when `to_replace` is a dict,
# so a positional column name would leave the replacement unrestricted.
df.na.replace({'NA': 'NaN'}, subset=['PM_Dongsi']).show(truncate=False)

